Importing Libraries¶

In [111]:
# Import libraries
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import figure
figure(figsize=(5, 20), dpi=300)
from sklearn import preprocessing
from sklearn import manifold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from plotnine import ggplot, aes, geom_line, geom_abline, ggtitle, xlab, ylab
from sklearn.preprocessing import MinMaxScaler
import random 
from sklearn import metrics
import plotly.express as px
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')
<Figure size 1500x6000 with 0 Axes>

Importing Dataset and Merging¶

In [112]:
# Load data
features = pd.read_csv("Features.csv")
labels = pd.read_csv("Target.CSV")

Data Pre-Processing¶


Concatenating columnwise


In [113]:
data = pd.concat([features, labels], axis=1)

Replacing and Eliminating certain class value


In [114]:
data['BlcaGrade'] = data['BlcaGrade'].replace('Grade I', 'Stage I')
data['BlcaGrade'] = data['BlcaGrade'].replace('Grade II', 'Stage II')
data['BlcaGrade'] = data['BlcaGrade'].replace('Grade III', 'Stage III')
data['BlcaGrade'] = data['BlcaGrade'].replace('Grade IV', 'Stage IV')
# Replacing
data['BlcaGrade'] = data['BlcaGrade'].replace('Stage I', 'Stage II')
# Eliminating
data = data.drop(data[(data['BlcaGrade'] == 'Stage II')].index)
data['BlcaGrade'].value_counts()
label_names = ['Stage III', 'Stage IV']
In [115]:
data['BlcaGrade'].value_counts()
Out[115]:
Stage III    148
Stage IV     142
Name: BlcaGrade, dtype: int64

Encoding Labels


0: Stage III
1: Stage IV
In [116]:
# label_encoder
label_encoder = preprocessing.LabelEncoder()
data['BlcaGrade']= label_encoder.fit_transform(data['BlcaGrade'])
data['BlcaGrade'].value_counts()
Out[116]:
0    148
1    142
Name: BlcaGrade, dtype: int64

Duplicate entry checking


In [117]:
# Outputs the total number of rows in the dataframe.
print("Total entries: ", len(data))
# 'duplicates = df.duplicated()' uses the duplicated method to create a boolean series indicating whether each row is a duplicate or not.
duplicates = data.duplicated()
# 'duplicate_rows = df[duplicates]' uses the boolean series to index the dataframe and obtain a sub-dataframe containing only the duplicate rows.
duplicate_rows = data[duplicates]
# Outputs the number of duplicate rows in the sub-dataframe.
print("Duplicate entries: ", len(duplicate_rows))
Total entries:  290
Duplicate entries:  0

NULL entry checking


In [185]:
print("NULL Entries: ", data.isnull().sum().sum())
NULL Entries:  0

t-SNE¶

In [7]:
# dimensionality reduction using t-SNE
tsne = manifold.TSNE(n_components=2, random_state=42)
# fit and transform
mnist_tr = tsne.fit_transform(data.drop('BlcaGrade',axis=1))
# create dataframe
cps_df = pd.DataFrame(columns=['Component 1', 'Component 2', 'target'], data=np.column_stack((mnist_tr, data['BlcaGrade'])))
# cast targets column to int
cps_df.loc[:, 'target'] = cps_df.target.astype(int)

fig = px.scatter(
    cps_df, x='Component 1', y='Component 2',
    color=cps_df.target.astype(str), labels={'color': 'Target Variable'}, width=600, height=400, title="Component 1 VS Component 2 with respect to their labels")
fig.show()
−20−10010−10010
Target Variable01Component 1 VS Component 2 with respect to their labelsComponent 1Component 2
plotly-logomark

Reducing features with Mutual Info¶

In [118]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

X = data.drop('BlcaGrade',axis=1)
y = data['BlcaGrade']

mutual_info = mutual_info_classif(X, y,random_state=0)
mutual_info = pd.Series(mutual_info)
best_cols = SelectKBest(mutual_info_classif, k=100)
best_cols.fit(X, y)
print((X.columns[best_cols.get_support()]))

reducedFeatures = pd.DataFrame(X.columns[best_cols.get_support()])
reducedFeatures.to_csv("ReducedFeatures.csv")

selectedFeatures = list(X.columns[best_cols.get_support()])
Index(['GDE1', 'CX3CL1', 'BARX2', 'NFYC', 'ISOC1', 'KLF6', 'SREBF1', 'FGFR1',
       'TP53INP2', 'PLOD1', 'FUS', 'LTBP4', 'PES1', 'FAM83D', 'TRADD', 'CA2',
       'SLC1A6', 'SLC1A5', 'KDELR1', 'MRPL27', 'SLC9A3R1', 'NMU', 'VPS29',
       'LTBR', 'FAM162A', 'TNNC1', 'EFEMP1', 'CD207', 'CXCR4', 'NR4A1',
       'PPDPF', 'DOHH', 'TNNI2', 'BARX1', 'HSPBP1', 'NARS1', 'ELF5', 'PRADC1',
       'IL36RN', 'TMPRSS4', 'DUSP5', 'IMPA2', 'NARF', 'PSMA5', 'HSPD1',
       'CPLX2', 'ERLIN2', 'CRH', 'GSN', 'EIF4EBP2', 'CAPN5', 'COMMD7',
       'C11orf53', 'XAGE2', 'UBE2L6', 'SLC38A10', 'CLSTN2', 'PRAC1', 'HK2',
       'TLCD1', 'METAP1', 'GEM', 'MRPL16', 'ANGPTL4', 'LRRC45', 'SLC16A5',
       'RPS9', 'CXXC5', 'BSG', 'CHCHD1', 'MRPL57', 'TOMM20', 'PLAAT3', 'RRS1',
       'MYLPF', 'METTL7A', 'TNFRSF18', 'AKR1C3', 'DIO3', 'SPANXC', 'RPL10A',
       'SNORD14E', 'PSMB8', 'MT-RNR1', 'IGKV3D-20', 'IGLV4-60', 'IGHV3-20',
       'EMP2', 'PGM5-AS1', 'AC113935.1', 'Y_RNA.5', 'AC004080.1', 'GJA5',
       'IGHV3-30', 'MMP28', 'AL035661.1', 'AL161431.1', 'H2AC8', 'MRPL45',
       'AC090498.1'],
      dtype='object')
In [188]:
score = list(best_cols.scores_)
score.sort(reverse=True)
score[:100]
# Figure Size
fig = plt.figure(figsize =(10, 7))
# Horizontal Bar Plot
plt.bar(best_cols.get_feature_names_out(X.columns)[:20], score[:20])
plt.xticks(rotation = 45)
# Show Plot
plt.show()

Split into features and target variable¶

In [119]:
X = data[selectedFeatures].values
y = data['BlcaGrade'].values

Defining Outer CV¶

In [120]:
k = len(y)/3 # Define the split size of outer cv here
k = int(k)

# Train 1, Train 2, Test 3 - Outer CV 1
outerFold_features_1 = X[:k]
outerFold_labels_1 = y[:k]
# Train 1, Test 2, Train 3 - Outer CV 2
outerFold_features_2 = X[k:2*k]
outerFold_labels_2 = y[k:2*k]
# Test 1, Train 2, Train 3 - Outer CV 3
outerFold_features_3 = X[2*k:(3*k)+1]
outerFold_labels_3 = y[2*k:(3*k)+1]

# Training Features and Labels for 1st Outer CV
features_1 = np.concatenate([outerFold_features_1, outerFold_features_2])
label_1 = np.concatenate([outerFold_labels_1, outerFold_labels_2])
# Training Features and Labels for 2nd Outer CV
features_2 = np.concatenate([outerFold_features_1, outerFold_features_3])
label_2 = np.concatenate([outerFold_labels_1, outerFold_labels_3])
# Training Features and Labels for 3rd Outer CV
features_3 = np.concatenate([outerFold_features_2, outerFold_features_3])
label_3 = np.concatenate([outerFold_labels_2, outerFold_labels_3])

Function Definitions¶

In [121]:
# ---------------------------------------------------------------------------------------------------------------
# This function prints a formatted string to the console with information about the current iteration in inner CV
# ---------------------------------------------------------------------------------------------------------------
def disp(count, feature, p1, p2, trainResult, testResult, selected, clf_arguments1, clf_arguments2):
  ''' This function prints a formatted string to the console with information about the current iteration in inner CV

      args: (9 arguments)
        count - Iteration count
        feature - Particularly the best feature
        p1 - One of the parameter value for classifier
        p2 - Another parameter value for classifier
        trainResult - Mean training accuracy of a particular iteration in inner cv
        testResult - Mean test accuracy of a particular iteration in innver cv
        selected - Global variable reference 'featuresOuterFold'; It stores the best features as the iteration goes on for inner cv
        clf_arguments1 - Name for one of the parameter of classifier
        clf_arguments2 - Name for another parameter of classifier

      Returns:
        No return value
  '''
  print("Iteration " + str(count) + " >> Feature: " + str(feature) + "; " + clf_arguments1 + ": " + str(p1) + "; " + clf_arguments2 + ": " + str(p2) + "; Train Accuracy: " + str(round(trainResult, 4)) + "; Test Accuracy: " + str(round(testResult, 4)) + "; Selected Features: " + str(selected))
  print("---------------------------------------------------------------------------------------------------------------------------------")
# ----------------------------------------------------------------
# This function is used to perform inner cross-validation with FFS 
# ----------------------------------------------------------------
def innerCV(count, features, param1, param2, X, y, cv, clf, clf_arguments1, clf_arguments2):
  '''
    This function is used to perform inner cross-validation on a machine learning algorithm, where two parameters are being optimized.
    The function loops through all possible combinations of parameter values and feature indices.
    
    args: 
      count - Iteration count
      feautres - Reduced features from Mutual Information
      param1 - One of the parameter value for classifier
      param2 - Another parameter value for classifier
      X - Features of a particular outerfold 
      y - Target Variable of a particular outerfold 
      cv - K value for inner cv
      clf - Class name of classifier
      clf_arguments1 - Name for one of the parameter of classifier
      clf_arguments2 - Name for another parameter of classifier
    
    Returns:
      feature - The best feature index found in the loop
  '''

  temp = 0.0

  for i in features:
    for j in param1:
      for k in param2:
        # This line creates a dictionary called args containing the two parameter values being tested in this iteration of the loop
        args = {clf_arguments1:j, clf_arguments2:k}
        # This line uses scikit-learn's cross_validate function to perform cross-validation on the machine learning algorithm being tested (clf) 
        # It passes in the args dictionary as the parameters to the algorithm
        scores = cross_validate(clf(**args), X[:, i - 1].reshape(-1, 1), y, cv=cv, return_train_score=True)
        # This line checks if the current test score is greater than or equal to the current best test score 
        if temp <= (float)(scores['test_score'].mean()):
          # This line updates the best test score to be the current test score
          temp = (float)(scores['test_score'].mean())
          # This line stores the mean training score for the current iteration
          trainResult = scores['train_score'].mean()
          # This line stores the mean test score for the current iteration
          testResult = scores['test_score'].mean()
          # Select the best feature in each iteration
          feature = i
          # Select the best parameters in each iteration
          p1 = j
          # Select the best parameters in each iteration
          p2 = k
  # This line appends the current best feature to a global list
  featuresOuterFold.append(feature)
  # This line calls a function called disp to print out information about the current iteration of the program
  disp(count, feature, p1, p2, trainResult, testResult, featuresOuterFold, clf_arguments1, clf_arguments2)
  # This line returns the best feature index found in the loop
  return feature
# ------------------------------------------------------------------------------
# This function implements the outer loop of the nested cross-validation process 
# ------------------------------------------------------------------------------
def outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, X, y):
  '''
  A function to perform the outer fold cross-validation by iterating over the features and hyperparameters to find the best combination of both. 
  The function takes the following arguments:
  
  args:
    clf - Class name of classifier 
    clf_arguments1 - Name for one of the parameter of classifier
    clf_arguments2 - Name for another parameter of classifier
    param1 - One of the parameter value for classifier
    param2 - Another parameter value for classifier
    X - Features of a particular outerfold 
    y - Target Variable of a particular outerfold 
  
  Returns:
    No return value
  '''
  ## Create a list of reduced feature numbers from 1 to the total number of selected features
  features = [i+1 for i in range(len(selectedFeatures))]
  # Remove the last parameter value from params1 list and assign it to param1 variable
  param1 = params1.pop()
  # Remove the last parameter value from params2 list and assign it to param2 variable
  param2 = params2.pop()

  # Loop through the outer fold cross-validation iterations, from 1 to 5 (inclusive)
  for i in range(1, 6):
    if i == 1:
      # Call the innerCV function with the first set of features and hyperparameters
      feature = innerCV(i, features, param1, param2, X, y, 5, clf, clf_arguments1, clf_arguments2)
    else:
      # Remove the previously selected feature from the features list
      features.remove(feature)
      # Remove the last parameter value from params1 list and assign it to param1 variable
      param1 = params1.pop()
      # Remove the last parameter value from params2 list and assign it to param2 variable
      param2 = params2.pop()
      # Call the innerCV function with the updated set of features and hyperparameters
      # Store the best feature in the feature variable
      feature = innerCV(i, features, param1, param2, X, y, 5, clf, clf_arguments1, clf_arguments2)
In [122]:
# -------------------------------------------------------------------------------------------------------
# This function is used to evaluate the performance of a classifier on the outer fold of cross-validation 
# -------------------------------------------------------------------------------------------------------
def evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, selectedF, X, y, outerFold_features, outerFold_labels, label_names):
  '''
  This function is used to evaluate the performance of a classifier on the outer fold of cross-validation.

  args:
    clf_name: a string specifying the name of the classifier being used
    clf: the name of the classifier class
    clf_arguments1: a string specifying the name of one of the parameters of the classifier
    clf_arguments2: a string specifying the name of another parameter of the classifier
    param1: the value for the first parameter of the classifier
    param2: the value for the second parameter of the classifier
    selectedF: Index list of top 5 features from nested FFS on outer CV training data
    X: Values of top 5 features in the dataset
    y: Corresponding target variable of the top 5 features
    outerFold_features: Unseen feature data from outer fold
    outerFold_labels: Unseen corresponding target variable from outer fold
    label_names: a list of two strings specifying the names of the two classes in the target variable for plot of confusion matrix
  
  Returns:
    No return value
  '''
  # Mapping features from list to array - Array starts with 0 but list doesn't
  selectedF = [x - 1 for x in featuresOuterFold]
  # -----------------------------------------------------------------------------------------
  # Training classifier with the parameters and features with best accuracy found in inner CV
  # -----------------------------------------------------------------------------------------
  '''
  This block of code checks if the classifier is SVM, and if it is, it sets probability to True while creating a new instance of the classifier.
  It creates a dictionary of the classifier arguments (clf_arguments1 and clf_arguments2) and their corresponding values (param1 and param2),
  and passes them as keyword arguments to the classifier function, creating an instance of the classifier.
  If the classifier is not SVM, it creates a new instance of the classifier using the same dictionary of arguments without setting the probability parameter.
  Finally, it fits the created classifier instance on the selected features (selectedF) of the training data (X) and their corresponding labels (y).
  '''
  if clf_name == 'SVM':
    args = {clf_arguments1:param1, clf_arguments2:param2}
    classifier = clf(**args, probability=True)
  else:
    args = {clf_arguments1:param1, clf_arguments2:param2}
    classifier = clf(**args)
  classifier.fit(X[:, selectedF], y)
  print("--------------------------------------------------------------------------------------------------")
  print("Training Score on outer fold:", round(classifier.score(X[:, selectedF], y), 6) * 100)
  print("--------------------------------------------------------------------------------------------------")
  # --------------------------------------------------------------------------------------------------
  # Testing on Outer CV
  # --------------------------------------------------------------------------------------------------
  y_predict = classifier.predict(outerFold_features[:, selectedF])
  print("Test Accuracy on outer fold:", round(accuracy_score(outerFold_labels, y_predict), 6) * 100)
  print("--------------------------------------------------------------------------------------------------")
  print(clf_arguments1 + ": " + str(param1))
  print(clf_arguments2 + ": " + str(param2))
  print("Features: ", featuresOuterFold)
  print("--------------------------------------------------------------------------------------------------")
  # --------------------------------------------------------------------------------------------------
  # Classification Report
  # --------------------------------------------------------------------------------------------------
  print(classification_report(outerFold_labels, y_predict, digits=6))
  print("--------------------------------------------------------------------------------------------------")
  # --------------------------------------------------------------------------------------------------
  # ROC AUC Curve and Score
  # --------------------------------------------------------------------------------------------------
  # Generating a list of 0 with the length of outerFold_labels
  ns_probs = [0 for _ in range(len(outerFold_labels))]
  # Predicting class probabilities for the test set using the classifier
  lr_probs = classifier.predict_proba(outerFold_features[:, selectedF])
  # Selecting only the probabilities for the positive class
  lr_probs = lr_probs[:, 1]
  # Calculating the ROC AUC score for a model that predicts only 0's
  ns_auc = roc_auc_score(outerFold_labels, ns_probs)
  # Calculating the ROC AUC score for the classifier
  lr_auc = roc_auc_score(outerFold_labels, lr_probs)
  # Print the no skill ROC AUC score and the ROC AUC score of the classifier
  print('No Skill: ROC AUC=%.3f' % (ns_auc))
  print(clf_name + ': ROC AUC=%.3f' % (lr_auc))
  print("--------------------------------------------------------------------------------------------------")

  # Compute the false positive rate, true positive rate and thresholds for the no skill model and the classifier
  ns_fpr, ns_tpr, _ = roc_curve(outerFold_labels, ns_probs)
  lr_fpr, lr_tpr, _ = roc_curve(outerFold_labels, lr_probs)

  # Plot the ROC curves for the no skill model and the classifier
  plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
  plt.plot(lr_fpr, lr_tpr, marker='.', label=clf_name)
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  plt.legend()
  plt.title(clf_name)
  plt.show()
  print("--------------------------------------------------------------------------------------------------")
  print("Confusion Matrix")
  print("--------------------------------------------------------------------------------------------------")
  ax= plt.subplot()
  plt.title(clf_name)
  metrics.ConfusionMatrixDisplay(
  confusion_matrix = metrics.confusion_matrix(outerFold_labels, y_predict), display_labels = [label_names[0], label_names[1]]).plot(ax=ax, cmap=plt.cm.Greens);
  # ------------------------------------------------------------------------------------------------------------
  # These lines append various metrics and information to global variables to be used later for further analysis
  # ------------------------------------------------------------------------------------------------------------
  param_1.append(param1)
  param_2.append(param2)
  trainScore.append(round(classifier.score(X[:, selectedF], y), 6) * 100)
  testScore.append(round(accuracy_score(outerFold_labels, y_predict), 6) * 100)
  falsePositiveRate.append(lr_fpr)
  truePositiveRate.append(lr_tpr)
  aucScore.append(lr_auc)
  featureSubset.append(featuresOuterFold)
In [123]:
def combinedROCPlot(clf_name, featureSubset, param_1, param_2, trainScore, testScore, aucScore, falsePositiveRate, truePositiveRate):
  '''
  This is a function that plots a combined ROC curve for a given classifier over multiple outer folds of cross-validation. 
  The function takes in the following parameters:
  
  args:
    clf_name: the name of the classifier being used
    featureSubset: a list containing the selected feature subset for each outer fold
    param_1: a list containing the value of the first hyperparameter for the classifier for each outer fold
    param_2: a list containing the value of the second hyperparameter for the classifier for each outer fold
    trainScore: a list containing the training score for the classifier for each outer fold
    testScore: a list containing the testing score for the classifier for each outer fold
    aucScore: a list containing the AUC score for the classifier for each outer fold
    falsePositiveRate: a list containing the false positive rate for the ROC curve for each outer fold
    truePositiveRate: a list containing the true positive rate for the ROC curve for each outer fold
  '''
  # Plot the No Skill line
  plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')

  # Iterate over each outer fold and print out the results
  for i in range(0, 3):
    print("--------------------------------------------")
    print("Outer Fold " + str(i + 1) + " Result")
    print("--------------------------------------------")
    print("Feature Subset: ", featureSubset[i])
    print("Best n_estimator: ", param_1[i])
    print("Best max_depth: ", param_2[i])
    print("Train Score: ", trainScore[i])
    print("Test Score: ", testScore[i])
    print("AUC Score: ", aucScore[i])

  # Iterate over each outer fold and plot the ROC curve for that fold
  for i in range(0, 3):
    plt.plot(falsePositiveRate[i], truePositiveRate[i], marker='.', label = clf_name + ' - Outer Fold ' + str(i+1))

  # Add axis labels, legend, and display the graph
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  plt.legend()
  plt.show()

Result¶

Random Forest¶

In [14]:
# Global variables
# These lines of code are initializing several empty lists 
# which will be used later in the program to store data or results
# Resetting global variable data for other classifiers
param_1 = []
param_2 = []
trainScore = []
testScore = []
falsePositiveRate = []
truePositiveRate = []
aucScore = []
featureSubset = []

Evaluating Outer Fold 1¶


In [15]:
# Classifier Name
clf_name = 'Random Forest'
# Classifier's class name
clf = RandomForestClassifier
# Argument 1 name for classifier 
clf_arguments1 = 'n_estimators'
# Argument 2 name for classifier
clf_arguments2 = 'max_depth'
# Values of argument 1 parameter list for classifier
params1 = [[5, 10, 15], [40, 25, 30], [35, 20, 45], [50, 55, 60], [90, 100, 180]]
# Values of argument 2 parameter list for classifier
params2 = [[1, 2], [2, 3], [3, 4], [4, 6], [5, 7]]
# Empty list for FFS - Global Variable
featuresOuterFold = []
# Invoking FFS
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_1, label_1)
Iteration 1 >> Feature: 36; n_estimators: 180; max_depth: 5; Train Accuracy: 0.7786; Test Accuracy: 0.6877; Selected Features: [36]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 2 >> Feature: 63; n_estimators: 55; max_depth: 4; Train Accuracy: 0.7722; Test Accuracy: 0.6409; Selected Features: [36, 63]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 3 >> Feature: 5; n_estimators: 20; max_depth: 3; Train Accuracy: 0.7018; Test Accuracy: 0.6302; Selected Features: [36, 63, 5]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 4 >> Feature: 48; n_estimators: 40; max_depth: 3; Train Accuracy: 0.7149; Test Accuracy: 0.6143; Selected Features: [36, 63, 5, 48]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 5 >> Feature: 45; n_estimators: 10; max_depth: 2; Train Accuracy: 0.6888; Test Accuracy: 0.62; Selected Features: [36, 63, 5, 48, 45]
---------------------------------------------------------------------------------------------------------------------------------
In [20]:
# Values of argument 1 parameter list for classifier
param1 = 180
# Values of argument 2 parameter list for classifier
param2 = 5

evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_1, label_1, outerFold_features_3, outerFold_labels_3, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 96.3542
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 51.546400000000006
--------------------------------------------------------------------------------------------------
n_estimators: 180
max_depth: 5
Features:  [82, 72, 27, 99, 77]
--------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0   0.493151  0.782609  0.605042        46
           1   0.583333  0.274510  0.373333        51

    accuracy                       0.515464        97
   macro avg   0.538242  0.528559  0.489188        97
weighted avg   0.540566  0.515464  0.483216        97

--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Random Forest: ROC AUC=0.560
--------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------
Confusion Matrix
--------------------------------------------------------------------------------------------------

Evaluating Outer Fold 2¶


In [17]:
# Values of argument 1 parameter list for classifier
params1 = [[3, 5, 15], [40, 25, 30], [35, 20, 45], [10, 55, 60], [90, 100, 180]]
# Values of argument 2 parameter list for classifier
params2 = [[1, 2], [2, 3], [3, 4], [4, 6], [5, 7]]
# Empty list for FFS - Global Variable
featuresOuterFold = []
# Invoking FFS
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_2, label_2)
Iteration 1 >> Feature: 75; n_estimators: 90; max_depth: 5; Train Accuracy: 0.8368; Test Accuracy: 0.6794; Selected Features: [75]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 2 >> Feature: 92; n_estimators: 55; max_depth: 4; Train Accuracy: 0.7824; Test Accuracy: 0.6629; Selected Features: [75, 92]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 3 >> Feature: 81; n_estimators: 20; max_depth: 3; Train Accuracy: 0.7306; Test Accuracy: 0.674; Selected Features: [75, 92, 81]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 4 >> Feature: 23; n_estimators: 25; max_depth: 3; Train Accuracy: 0.7344; Test Accuracy: 0.6528; Selected Features: [75, 92, 81, 23]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 5 >> Feature: 91; n_estimators: 3; max_depth: 1; Train Accuracy: 0.6373; Test Accuracy: 0.6267; Selected Features: [75, 92, 81, 23, 91]
---------------------------------------------------------------------------------------------------------------------------------
In [21]:
# Values of argument 1 parameter list for classifier
param1 = 20
# Values of argument 2 parameter list for classifier
param2 = 3

evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_2, label_2, outerFold_features_2, outerFold_labels_2, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 79.27459999999999
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 58.3333
--------------------------------------------------------------------------------------------------
n_estimators: 20
max_depth: 3
Features:  [82, 72, 27, 99, 77]
--------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0   0.591837  0.591837  0.591837        49
           1   0.574468  0.574468  0.574468        47

    accuracy                       0.583333        96
   macro avg   0.583152  0.583152  0.583152        96
weighted avg   0.583333  0.583333  0.583333        96

--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Random Forest: ROC AUC=0.570
--------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------
Confusion Matrix
--------------------------------------------------------------------------------------------------

Evaluating Outer Fold 3¶


In [19]:
# Values of argument 1 parameter list for classifier
params1 = [[3, 5, 15], [40, 25, 30], [70, 20, 45], [10, 55, 60], [90, 150, 200]]
# Values of argument 2 parameter list for classifier
params2 = [[1, 2], [2, 3], [3, 4], [4, 6], [5, 7]]
# Empty list for FFS - Global Variable
featuresOuterFold = []
# Invoking FFS
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_3, label_3)
Iteration 1 >> Feature: 82; n_estimators: 150; max_depth: 7; Train Accuracy: 0.9223; Test Accuracy: 0.6795; Selected Features: [82]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 2 >> Feature: 72; n_estimators: 60; max_depth: 6; Train Accuracy: 0.877; Test Accuracy: 0.6579; Selected Features: [82, 72]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 3 >> Feature: 27; n_estimators: 70; max_depth: 4; Train Accuracy: 0.7967; Test Accuracy: 0.6483; Selected Features: [82, 72, 27]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 4 >> Feature: 99; n_estimators: 40; max_depth: 3; Train Accuracy: 0.6917; Test Accuracy: 0.6117; Selected Features: [82, 72, 27, 99]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 5 >> Feature: 77; n_estimators: 15; max_depth: 1; Train Accuracy: 0.6282; Test Accuracy: 0.6219; Selected Features: [82, 72, 27, 99, 77]
---------------------------------------------------------------------------------------------------------------------------------
In [22]:
# Values of argument 1 parameter list for classifier
param1 = 40
# Values of argument 2 parameter list for classifier
param2 = 3

evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_3, label_3, outerFold_features_1, outerFold_labels_1, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 77.2021
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 53.125
--------------------------------------------------------------------------------------------------
n_estimators: 40
max_depth: 3
Features:  [82, 72, 27, 99, 77]
--------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0   0.566038  0.576923  0.571429        52
           1   0.488372  0.477273  0.482759        44

    accuracy                       0.531250        96
   macro avg   0.527205  0.527098  0.527094        96
weighted avg   0.530441  0.531250  0.530788        96

--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Random Forest: ROC AUC=0.579
--------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------
Confusion Matrix
--------------------------------------------------------------------------------------------------

Comparison among Outer Fold Results¶

In [23]:
combinedROCPlot(clf_name, featureSubset, param_1, param_2, trainScore, testScore, aucScore, falsePositiveRate, truePositiveRate)
--------------------------------------------
Outer Fold 1 Result
--------------------------------------------
Feature Subset:  [36, 63, 5, 48, 45]
Best n_estimator:  5
Best max_depth:  5
Train Score:  82.8125
Test Score:  40.206199999999995
AUC Score:  0.45417732310315434
--------------------------------------------
Outer Fold 2 Result
--------------------------------------------
Feature Subset:  [75, 92, 81, 23, 91]
Best n_estimator:  5
Best max_depth:  5
Train Score:  88.0829
Test Score:  51.0417
AUC Score:  0.4982631350412505
--------------------------------------------
Outer Fold 3 Result
--------------------------------------------
Feature Subset:  [82, 72, 27, 99, 77]
Best n_estimator:  180
Best max_depth:  5
Train Score:  96.3542
Test Score:  51.546400000000006
AUC Score:  0.5601023017902813
In [24]:
listB = [82, 72, 27, 99, 77]
#listB = [42, 7, 28, 32, 20]

res = list(map(selectedFeatures.__getitem__, listB))
res.append('BlcaGrade')
sns.pairplot(data[res], hue='BlcaGrade', palette='tab10')
Out[24]:
<seaborn.axisgrid.PairGrid at 0x7f7156aff100>

SVM¶

In [ ]:
# Global variables
# These lines of code are initializing several empty lists 
# which will be used later in the program to store data or results
# Resetting global variable data for other classifiers
param_1 = []
param_2 = []
trainScore = []
testScore = []
falsePositiveRate = []
truePositiveRate = []
aucScore = []
featureSubset = []

Evaluating Outer Fold 1¶


In [29]:
# Classifier Name
clf_name = 'SVM'
# Classifier's class name
clf = SVC
# Argument 1 name for classifier 
clf_arguments1 = 'C'
# Argument 2 name for classifier
clf_arguments2 = 'kernel'
# Values of argument 1 parameter list for classifier
params1 = [[0.001, 0.01, 0.02], [0.01, 0.1, 0.003], [0.001, 0.1, 0.2], [1, 0.002, 0.003], [0.01, 0.2, 0.3]]
# Values of argument 2 parameter list for classifier
params2 = [['rbf'], ['rbf'], ['rbf'], ['rbf'], ['rbf']]
# Empty list for FFS - Global Variable
featuresOuterFold = []
# Invoking FFS
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_1, label_1)
Iteration 1 >> Feature: 41; C: 0.3; kernel: rbf; Train Accuracy: 0.6081; Test Accuracy: 0.6042; Selected Features: [41]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 2 >> Feature: 45; C: 1; kernel: rbf; Train Accuracy: 0.6276; Test Accuracy: 0.5885; Selected Features: [41, 45]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 3 >> Feature: 74; C: 0.2; kernel: rbf; Train Accuracy: 0.6003; Test Accuracy: 0.5941; Selected Features: [41, 45, 74]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 4 >> Feature: 42; C: 0.1; kernel: rbf; Train Accuracy: 0.5404; Test Accuracy: 0.5364; Selected Features: [41, 45, 74, 42]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 5 >> Feature: 100; C: 0.02; kernel: rbf; Train Accuracy: 0.526; Test Accuracy: 0.526; Selected Features: [41, 45, 74, 42, 100]
---------------------------------------------------------------------------------------------------------------------------------
In [30]:
# Values of argument 1 parameter list for classifier
param1 = 0.3
# Values of argument 2 parameter list for classifier
param2 = 'rbf'

evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_1, label_1, outerFold_features_3, outerFold_labels_3, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 63.5417
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 49.4845
--------------------------------------------------------------------------------------------------
C: 0.3
kernel: rbf
Features:  [41, 45, 74, 42, 100]
--------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0   0.477612  0.695652  0.566372        46
           1   0.533333  0.313725  0.395062        51

    accuracy                       0.494845        97
   macro avg   0.505473  0.504689  0.480717        97
weighted avg   0.506909  0.494845  0.476301        97

--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
SVM: ROC AUC=0.541
--------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------
Confusion Matrix
--------------------------------------------------------------------------------------------------

Evaluating Outer Fold 2¶


In [31]:
# Values of argument 1 parameter list for classifier
params1 = [[0.001, 0.1, 0.2], [0.001, 0.1, 0.01], [0.001, 0.1, 1], [0.001, 0.1, 0.2], [0.001, 0.1, 0.3]]
# Values of argument 2 parameter list for classifier
params2 = [['rbf'], ['rbf'], ['rbf'], ['rbf'], ['rbf']]
# Empty list for FFS - Global Variable
featuresOuterFold = []
# Invoking FFS
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_2, label_2)
Iteration 1 >> Feature: 81; C: 0.1; kernel: rbf; Train Accuracy: 0.6425; Test Accuracy: 0.6426; Selected Features: [81]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 2 >> Feature: 14; C: 0.2; kernel: rbf; Train Accuracy: 0.5803; Test Accuracy: 0.6066; Selected Features: [81, 14]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 3 >> Feature: 91; C: 1; kernel: rbf; Train Accuracy: 0.6205; Test Accuracy: 0.6166; Selected Features: [81, 14, 91]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 4 >> Feature: 73; C: 0.1; kernel: rbf; Train Accuracy: 0.5985; Test Accuracy: 0.5914; Selected Features: [81, 14, 91, 73]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 5 >> Feature: 63; C: 0.2; kernel: rbf; Train Accuracy: 0.6153; Test Accuracy: 0.5907; Selected Features: [81, 14, 91, 73, 63]
---------------------------------------------------------------------------------------------------------------------------------
In [33]:
# Values of argument 1 parameter list for classifier
param1 = 0.2
# Values of argument 2 parameter list for classifier
param2 = 'rbf'

evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_2, label_2, outerFold_features_2, outerFold_labels_2, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 63.7306
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 41.6667
--------------------------------------------------------------------------------------------------
C: 0.2
kernel: rbf
Features:  [81, 14, 91, 73, 63]
--------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0   0.400000  0.285714  0.333333        49
           1   0.426230  0.553191  0.481481        47

    accuracy                       0.416667        96
   macro avg   0.413115  0.419453  0.407407        96
weighted avg   0.412842  0.416667  0.405864        96

--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
SVM: ROC AUC=0.451
--------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------
Confusion Matrix
--------------------------------------------------------------------------------------------------

Evaluating Outer Fold 3¶


In [34]:
# Values of argument 1 parameter list for classifier
params1 = [[0.01,0.002], [0.03,0.1], [0.02,0.01], [0.001,0.3], [0.3,0.002]]
# Values of argument 2 parameter list for classifier
params2 = [['rbf'], ['rbf'], ['rbf'], ['rbf'], ['rbf']]
# Empty list for FFS - Global Variable
featuresOuterFold = []
# Invoking FFS
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_3, label_3)
Iteration 1 >> Feature: 73; C: 0.3; kernel: rbf; Train Accuracy: 0.6062; Test Accuracy: 0.5868; Selected Features: [73]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 2 >> Feature: 77; C: 0.3; kernel: rbf; Train Accuracy: 0.6088; Test Accuracy: 0.5857; Selected Features: [73, 77]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 3 >> Feature: 100; C: 0.01; kernel: rbf; Train Accuracy: 0.5078; Test Accuracy: 0.5077; Selected Features: [73, 77, 100]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 4 >> Feature: 4; C: 0.1; kernel: rbf; Train Accuracy: 0.6127; Test Accuracy: 0.5957; Selected Features: [73, 77, 100, 4]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 5 >> Feature: 99; C: 0.002; kernel: rbf; Train Accuracy: 0.5078; Test Accuracy: 0.5077; Selected Features: [73, 77, 100, 4, 99]
---------------------------------------------------------------------------------------------------------------------------------
In [36]:
# Values of argument 1 parameter list for classifier
param1 = 0.002
# Values of argument 2 parameter list for classifier
param2 = 'rbf'

evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_3, label_3, outerFold_features_1, outerFold_labels_1, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 50.7772
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 45.8333
--------------------------------------------------------------------------------------------------
C: 0.002
kernel: rbf
Features:  [73, 77, 100, 4, 99]
--------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0   0.000000  0.000000  0.000000        52
           1   0.458333  1.000000  0.628571        44

    accuracy                       0.458333        96
   macro avg   0.229167  0.500000  0.314286        96
weighted avg   0.210069  0.458333  0.288095        96

--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
SVM: ROC AUC=0.393
--------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------
Confusion Matrix
--------------------------------------------------------------------------------------------------

Comparison among Outer Fold Results¶

In [37]:
combinedROCPlot(clf_name, featureSubset, param_1, param_2, trainScore, testScore, aucScore, falsePositiveRate, truePositiveRate)
--------------------------------------------
Outer Fold 1 Result
--------------------------------------------
Feature Subset:  [36, 63, 5, 48, 45]
Best n_estimator:  5
Best max_depth:  5
Train Score:  82.8125
Test Score:  40.206199999999995
AUC Score:  0.45417732310315434
--------------------------------------------
Outer Fold 2 Result
--------------------------------------------
Feature Subset:  [75, 92, 81, 23, 91]
Best n_estimator:  5
Best max_depth:  5
Train Score:  88.0829
Test Score:  51.0417
AUC Score:  0.4982631350412505
--------------------------------------------
Outer Fold 3 Result
--------------------------------------------
Feature Subset:  [82, 72, 27, 99, 77]
Best n_estimator:  180
Best max_depth:  5
Train Score:  96.3542
Test Score:  51.546400000000006
AUC Score:  0.5601023017902813
In [ ]:
listB = [78, 68, 48, 59, 61]

res = list(map(selectedFeatures.__getitem__, listB))
res.append('BlcaGrade')
sns.pairplot(data[res], hue='BlcaGrade', palette='tab10')

Gradient Boosting¶

In [92]:
# Global variables
# These lines of code are initializing several empty lists 
# which will be used later in the program to store data or results
# Resetting global variable data for other classifiers
param_1 = []
param_2 = []
trainScore = []
testScore = []
falsePositiveRate = []
truePositiveRate = []
aucScore = []
featureSubset = []

Evaluating Outer Fold 1¶


In [93]:
# Classifier Name
clf_name = 'Gradient Boosting'
# Classifier's class name
clf = GradientBoostingClassifier
# Argument 1 name for classifier 
clf_arguments1 = 'n_estimators'
# Argument 2 name for classifier
clf_arguments2 = 'max_depth'
# Values of argument 1 parameter list for classifier
params1 = [[2, 3], [5, 25], [30, 35], [50, 60], [70, 100]]
# Values of argument 2 parameter list for classifier
params2 = [[1, 2], [2, 3], [3, 4], [4, 7], [5, 6]]
# Empty list for FFS - Global Variable
featuresOuterFold = []
# Invoking FFS
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_1, label_1)
Iteration 1 >> Feature: 44; n_estimators: 100; max_depth: 6; Train Accuracy: 1.0; Test Accuracy: 0.6354; Selected Features: [44]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 2 >> Feature: 69; n_estimators: 50; max_depth: 4; Train Accuracy: 0.9466; Test Accuracy: 0.6358; Selected Features: [44, 69]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 3 >> Feature: 31; n_estimators: 35; max_depth: 4; Train Accuracy: 0.9232; Test Accuracy: 0.6402; Selected Features: [44, 69, 31]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 4 >> Feature: 4; n_estimators: 5; max_depth: 3; Train Accuracy: 0.7122; Test Accuracy: 0.6301; Selected Features: [44, 69, 31, 4]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 5 >> Feature: 72; n_estimators: 3; max_depth: 2; Train Accuracy: 0.6523; Test Accuracy: 0.5779; Selected Features: [44, 69, 31, 4, 72]
---------------------------------------------------------------------------------------------------------------------------------
In [94]:
# Values of argument 1 parameter list for classifier
param1 = 5
# Values of argument 2 parameter list for classifier
param2 = 3

evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_1, label_1, outerFold_features_3, outerFold_labels_3, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 75.5208
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 54.6392
--------------------------------------------------------------------------------------------------
n_estimators: 5
max_depth: 3
Features:  [44, 69, 31, 4, 72]
--------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0   0.513889  0.804348  0.627119        46
           1   0.640000  0.313725  0.421053        51

    accuracy                       0.546392        97
   macro avg   0.576944  0.559037  0.524086        97
weighted avg   0.580195  0.546392  0.518775        97

--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Gradient Boosting: ROC AUC=0.607
--------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------
Confusion Matrix
--------------------------------------------------------------------------------------------------

Evaluating Outer Fold 2¶


In [95]:
# Values of argument 1 parameter list for classifier
params1 = [[5, 2], [3, 10], [15, 35], [20, 90], [80, 60]]
# Values of argument 2 parameter list for classifier
params2 = [[3, 5], [7, 4], [6, 3], [1, 6], [5, 6]]
# Empty list for FFS - Global Variable
featuresOuterFold = []
# Invoking FFS
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_2, label_2)
Iteration 1 >> Feature: 48; n_estimators: 60; max_depth: 6; Train Accuracy: 1.0; Test Accuracy: 0.6592; Selected Features: [48]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 2 >> Feature: 81; n_estimators: 20; max_depth: 1; Train Accuracy: 0.6593; Test Accuracy: 0.6479; Selected Features: [48, 81]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 3 >> Feature: 13; n_estimators: 35; max_depth: 3; Train Accuracy: 0.8368; Test Accuracy: 0.6474; Selected Features: [48, 81, 13]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 4 >> Feature: 91; n_estimators: 10; max_depth: 4; Train Accuracy: 0.7513; Test Accuracy: 0.653; Selected Features: [48, 81, 13, 91]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 5 >> Feature: 56; n_estimators: 2; max_depth: 5; Train Accuracy: 0.6891; Test Accuracy: 0.6173; Selected Features: [48, 81, 13, 91, 56]
---------------------------------------------------------------------------------------------------------------------------------
In [96]:
# Values of argument 1 parameter list for classifier
param1 = 2
# Values of argument 2 parameter list for classifier
param2 = 5

evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_2, label_2, outerFold_features_2, outerFold_labels_2, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 86.0104
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 38.5417
--------------------------------------------------------------------------------------------------
n_estimators: 2
max_depth: 5
Features:  [48, 81, 13, 91, 56]
--------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0   0.407407  0.448980  0.427184        49
           1   0.357143  0.319149  0.337079        47

    accuracy                       0.385417        96
   macro avg   0.382275  0.384064  0.382132        96
weighted avg   0.382799  0.385417  0.383070        96

--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Gradient Boosting: ROC AUC=0.403
--------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------
Confusion Matrix
--------------------------------------------------------------------------------------------------

Evaluating Outer Fold 3¶


In [97]:
# Values of argument 1 parameter list for classifier
params1 = [[3, 5], [10, 25], [7, 50], [70, 60], [150, 200]]
# Values of argument 2 parameter list for classifier
params2 = [[2, 4], [3, 4], [3, 7], [4, 8], [5, 6]]
# Empty list for FFS - Global Variable
featuresOuterFold = []
# Invoking FFS
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_3, label_3)
Iteration 1 >> Feature: 82; n_estimators: 200; max_depth: 6; Train Accuracy: 0.9909; Test Accuracy: 0.6688; Selected Features: [82]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 2 >> Feature: 51; n_estimators: 60; max_depth: 4; Train Accuracy: 0.965; Test Accuracy: 0.643; Selected Features: [82, 51]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 3 >> Feature: 70; n_estimators: 50; max_depth: 3; Train Accuracy: 0.8809; Test Accuracy: 0.6426; Selected Features: [82, 51, 70]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 4 >> Feature: 95; n_estimators: 10; max_depth: 4; Train Accuracy: 0.8239; Test Accuracy: 0.6641; Selected Features: [82, 51, 70, 95]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 5 >> Feature: 27; n_estimators: 5; max_depth: 2; Train Accuracy: 0.6399; Test Accuracy: 0.617; Selected Features: [82, 51, 70, 95, 27]
---------------------------------------------------------------------------------------------------------------------------------
In [98]:
# Values of argument 1 parameter list for classifier
param1 = 5
# Values of argument 2 parameter list for classifier
param2 = 2

evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_3, label_3, outerFold_features_1, outerFold_labels_1, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 67.8756
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 53.125
--------------------------------------------------------------------------------------------------
n_estimators: 5
max_depth: 2
Features:  [82, 51, 70, 95, 27]
--------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0   0.577778  0.500000  0.536082        52
           1   0.490196  0.568182  0.526316        44

    accuracy                       0.531250        96
   macro avg   0.533987  0.534091  0.531199        96
weighted avg   0.537636  0.531250  0.531606        96

--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Gradient Boosting: ROC AUC=0.534
--------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------
Confusion Matrix
--------------------------------------------------------------------------------------------------

Comparison among Outer Fold Results¶

In [99]:
combinedROCPlot(clf_name, featureSubset, param_1, param_2, trainScore, testScore, aucScore, falsePositiveRate, truePositiveRate)
--------------------------------------------
Outer Fold 1 Result
--------------------------------------------
Feature Subset:  [44, 69, 31, 4, 72]
Best n_estimator:  5
Best max_depth:  3
Train Score:  75.5208
Test Score:  54.6392
AUC Score:  0.6072037510656436
--------------------------------------------
Outer Fold 2 Result
--------------------------------------------
Feature Subset:  [48, 81, 13, 91, 56]
Best n_estimator:  2
Best max_depth:  5
Train Score:  86.0104
Test Score:  38.5417
AUC Score:  0.4029526704298741
--------------------------------------------
Outer Fold 3 Result
--------------------------------------------
Feature Subset:  [82, 51, 70, 95, 27]
Best n_estimator:  5
Best max_depth:  2
Train Score:  67.8756
Test Score:  53.125
AUC Score:  0.5336538461538463
In [100]:
listB = [7, 95, 81, 28, 76]

res = list(map(selectedFeatures.__getitem__, listB))
res.append('BlcaGrade')
sns.pairplot(data[res], hue='BlcaGrade', palette='tab10')
Out[100]:
<seaborn.axisgrid.PairGrid at 0x7f713d03d570>

KNN¶

In [124]:
# Global variables
# These lines of code are initializing several empty lists 
# which will be used later in the program to store data or results
# Resetting global variable data for other classifiers
param_1 = []
param_2 = []
trainScore = []
testScore = []
falsePositiveRate = []
truePositiveRate = []
aucScore = []
featureSubset = []

Evaluating Outer Fold 1¶


In [125]:
# Classifier Name
clf_name = 'KNN'
# Classifier's class name
clf = KNeighborsClassifier
# Argument 1 name for classifier 
clf_arguments1 = 'n_neighbors'
# Argument 2 name for classifier
clf_arguments2 = 'weights'
# Values of argument 1 parameter list for classifier
params1 = [[2, 3], [3, 4], [4, 5], [5, 6], [6, 7]]
# Values of argument 2 parameter list for classifier
params2 = [['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance']]
# Empty list for FFS - Global Variable
featuresOuterFold = []
# Invoking FFS
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_1, label_1)
Iteration 1 >> Feature: 71; n_neighbors: 7; weights: distance; Train Accuracy: 1.0; Test Accuracy: 0.6775; Selected Features: [71]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 2 >> Feature: 44; n_neighbors: 5; weights: distance; Train Accuracy: 1.0; Test Accuracy: 0.6565; Selected Features: [71, 44]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 3 >> Feature: 6; n_neighbors: 5; weights: distance; Train Accuracy: 1.0; Test Accuracy: 0.6451; Selected Features: [71, 44, 6]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 4 >> Feature: 48; n_neighbors: 3; weights: uniform; Train Accuracy: 0.7357; Test Accuracy: 0.6614; Selected Features: [71, 44, 6, 48]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 5 >> Feature: 70; n_neighbors: 2; weights: uniform; Train Accuracy: 0.8099; Test Accuracy: 0.6405; Selected Features: [71, 44, 6, 48, 70]
---------------------------------------------------------------------------------------------------------------------------------
In [126]:
# Values of argument 1 parameter list for classifier
param1 = 3
# Values of argument 2 parameter list for classifier
param2 = 'uniform'

evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_1, label_1, outerFold_features_3, outerFold_labels_3, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 77.0833
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 41.2371
--------------------------------------------------------------------------------------------------
n_neighbors: 3
weights: uniform
Features:  [71, 44, 6, 48, 70]
--------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0   0.400000  0.478261  0.435644        46
           1   0.428571  0.352941  0.387097        51

    accuracy                       0.412371        97
   macro avg   0.414286  0.415601  0.411370        97
weighted avg   0.415022  0.412371  0.410119        97

--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
KNN: ROC AUC=0.431
--------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------
Confusion Matrix
--------------------------------------------------------------------------------------------------

Evaluating Outer Fold 2¶


In [127]:
# Values of argument 1 parameter list for classifier
params1 = [[2, 3], [3, 4], [4, 5], [5, 6], [6, 7]]
# Values of argument 2 parameter list for classifier
params2 = [['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance']]
# Empty list for FFS - Global Variable
featuresOuterFold = []
# Invoking FFS
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_2, label_2)
Iteration 1 >> Feature: 81; n_neighbors: 7; weights: distance; Train Accuracy: 1.0; Test Accuracy: 0.6784; Selected Features: [81]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 2 >> Feature: 5; n_neighbors: 5; weights: uniform; Train Accuracy: 0.7461; Test Accuracy: 0.6734; Selected Features: [81, 5]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 3 >> Feature: 13; n_neighbors: 5; weights: distance; Train Accuracy: 1.0; Test Accuracy: 0.6629; Selected Features: [81, 5, 13]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 4 >> Feature: 49; n_neighbors: 3; weights: distance; Train Accuracy: 1.0; Test Accuracy: 0.649; Selected Features: [81, 5, 13, 49]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 5 >> Feature: 43; n_neighbors: 2; weights: uniform; Train Accuracy: 0.7927; Test Accuracy: 0.6521; Selected Features: [81, 5, 13, 49, 43]
---------------------------------------------------------------------------------------------------------------------------------
In [128]:
# Values of argument 1 parameter list for classifier
param1 = 2
# Values of argument 2 parameter list for classifier
param2 = 'uniform'

evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_2, label_2, outerFold_features_2, outerFold_labels_2, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 81.86529999999999
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 50.0
--------------------------------------------------------------------------------------------------
n_neighbors: 2
weights: uniform
Features:  [81, 5, 13, 49, 43]
--------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0   0.507246  0.714286  0.593220        49
           1   0.481481  0.276596  0.351351        47

    accuracy                       0.500000        96
   macro avg   0.494364  0.495441  0.472286        96
weighted avg   0.494632  0.500000  0.474805        96

--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
KNN: ROC AUC=0.507
--------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------
Confusion Matrix
--------------------------------------------------------------------------------------------------

Evaluating Outer Fold 3¶


In [129]:
# Values of argument 1 parameter list for classifier
params1 = [[2, 3], [3, 4], [4, 5], [5, 1], [6, 7]]
# Values of argument 2 parameter list for classifier
params2 = [['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance'], ['uniform', 'distance']]
# Empty list for FFS - Global Variable
featuresOuterFold = []
# Invoking FFS
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_3, label_3)
Iteration 1 >> Feature: 71; n_neighbors: 6; weights: distance; Train Accuracy: 1.0; Test Accuracy: 0.6478; Selected Features: [71]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 2 >> Feature: 82; n_neighbors: 1; weights: distance; Train Accuracy: 0.9845; Test Accuracy: 0.6843; Selected Features: [71, 82]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 3 >> Feature: 51; n_neighbors: 5; weights: distance; Train Accuracy: 1.0; Test Accuracy: 0.6576; Selected Features: [71, 82, 51]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 4 >> Feature: 74; n_neighbors: 3; weights: uniform; Train Accuracy: 0.7669; Test Accuracy: 0.6533; Selected Features: [71, 82, 51, 74]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 5 >> Feature: 26; n_neighbors: 3; weights: uniform; Train Accuracy: 0.7889; Test Accuracy: 0.6482; Selected Features: [71, 82, 51, 74, 26]
---------------------------------------------------------------------------------------------------------------------------------
In [130]:
# Values of argument 1 parameter list for classifier
param1 = 3
# Values of argument 2 parameter list for classifier
param2 = 'uniform'

evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_3, label_3, outerFold_features_1, outerFold_labels_1, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 72.5389
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 57.2917
--------------------------------------------------------------------------------------------------
n_neighbors: 3
weights: uniform
Features:  [71, 82, 51, 74, 26]
--------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0   0.600000  0.634615  0.616822        52
           1   0.536585  0.500000  0.517647        44

    accuracy                       0.572917        96
   macro avg   0.568293  0.567308  0.567235        96
weighted avg   0.570935  0.572917  0.571367        96

--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
KNN: ROC AUC=0.545
--------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------
Confusion Matrix
--------------------------------------------------------------------------------------------------

Comparison among Outer Fold Results¶

In [131]:
combinedROCPlot(clf_name, featureSubset, param_1, param_2, trainScore, testScore, aucScore, falsePositiveRate, truePositiveRate)
--------------------------------------------
Outer Fold 1 Result
--------------------------------------------
Feature Subset:  [71, 44, 6, 48, 70]
Best n_estimator:  3
Best max_depth:  uniform
Train Score:  77.0833
Test Score:  41.2371
AUC Score:  0.43137254901960786
--------------------------------------------
Outer Fold 2 Result
--------------------------------------------
Feature Subset:  [81, 5, 13, 49, 43]
Best n_estimator:  2
Best max_depth:  uniform
Train Score:  81.86529999999999
Test Score:  50.0
AUC Score:  0.5065132435953105
--------------------------------------------
Outer Fold 3 Result
--------------------------------------------
Feature Subset:  [71, 82, 51, 74, 26]
Best n_estimator:  3
Best max_depth:  uniform
Train Score:  72.5389
Test Score:  57.2917
AUC Score:  0.5454545454545454
In [ ]:
listB = [66, 33, 84, 55, 99]

res = list(map(selectedFeatures.__getitem__, listB))
res.append('BlcaGrade')
sns.pairplot(data[res], hue='BlcaGrade', palette='tab10')

Logistic Regression¶

In [132]:
# Global variables
# These lines of code are initializing several empty lists 
# which will be used later in the program to store data or results
# Resetting global variable data for other classifiers
param_1 = []
param_2 = []
trainScore = []
testScore = []
falsePositiveRate = []
truePositiveRate = []
aucScore = []
featureSubset = []

Evaluating Outer Fold 1¶


In [133]:
# Classifier Name
clf_name = 'Logistic Regression'
# Classifier's class name
clf = LogisticRegression
# Argument 1 name for classifier 
clf_arguments1 = 'solver'
# Argument 2 name for classifier
clf_arguments2 = 'penalty'
# Values of argument 1 parameter list for classifier
params1 =params1 = [['lbfgs', 'liblinear'], ['lbfgs', 'liblinear'], ['lbfgs', 'liblinear'], ['lbfgs', 'liblinear'], ['lbfgs', 'liblinear']]
# Values of argument 2 parameter list for classifier
params2 = [['l2'], ['l2'], ['l2'], [ 'l2'], ['l2']]
# Empty list for FFS - Global Variable
featuresOuterFold = []
# Invoking FFS
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_1, label_1)
Iteration 1 >> Feature: 74; solver: liblinear; penalty: l2; Train Accuracy: 0.5756; Test Accuracy: 0.5733; Selected Features: [74]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 2 >> Feature: 73; solver: liblinear; penalty: l2; Train Accuracy: 0.5638; Test Accuracy: 0.5628; Selected Features: [74, 73]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 3 >> Feature: 4; solver: liblinear; penalty: l2; Train Accuracy: 0.5664; Test Accuracy: 0.5625; Selected Features: [74, 73, 4]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 4 >> Feature: 88; solver: liblinear; penalty: l2; Train Accuracy: 0.5677; Test Accuracy: 0.5578; Selected Features: [74, 73, 4, 88]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 5 >> Feature: 38; solver: liblinear; penalty: l2; Train Accuracy: 0.5443; Test Accuracy: 0.5578; Selected Features: [74, 73, 4, 88, 38]
---------------------------------------------------------------------------------------------------------------------------------
In [134]:
# Values of argument 1 parameter list for classifier
param1 = 'liblinear'
# Values of argument 2 parameter list for classifier
param2 = 'l2'

evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_1, label_1, outerFold_features_3, outerFold_labels_3, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 60.4167
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 46.391799999999996
--------------------------------------------------------------------------------------------------
solver: liblinear
penalty: l2
Features:  [74, 73, 4, 88, 38]
--------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0   0.455882  0.673913  0.543860        46
           1   0.482759  0.274510  0.350000        51

    accuracy                       0.463918        97
   macro avg   0.469320  0.474211  0.446930        97
weighted avg   0.470013  0.463918  0.441933        97

--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Logistic Regression: ROC AUC=0.442
--------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------
Confusion Matrix
--------------------------------------------------------------------------------------------------

Evaluating Outer Fold 2¶


In [135]:
# Values of argument 1 parameter list for classifier
params1 =   [['lbfgs', 'liblinear'], ['lbfgs', 'sag'], ['lbfgs', 'sag'], ['lbfgs', 'sag'], ['lbfgs', 'sag']]
# Values of argument 2 parameter list for classifier
params2 = [['l2'], ['l2'], ['l2'], ['l2'], ['l2']]
# Empty list for FFS - Global Variable
featuresOuterFold = []
# Invoking FFS
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_2, label_2)
Iteration 1 >> Feature: 81; solver: lbfgs; penalty: l2; Train Accuracy: 0.6438; Test Accuracy: 0.6217; Selected Features: [81]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 2 >> Feature: 72; solver: lbfgs; penalty: l2; Train Accuracy: 0.5933; Test Accuracy: 0.5914; Selected Features: [81, 72]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 3 >> Feature: 67; solver: lbfgs; penalty: l2; Train Accuracy: 0.5713; Test Accuracy: 0.5806; Selected Features: [81, 72, 67]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 4 >> Feature: 57; solver: sag; penalty: l2; Train Accuracy: 0.5428; Test Accuracy: 0.5808; Selected Features: [81, 72, 67, 57]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 5 >> Feature: 63; solver: liblinear; penalty: l2; Train Accuracy: 0.592; Test Accuracy: 0.5806; Selected Features: [81, 72, 67, 57, 63]
---------------------------------------------------------------------------------------------------------------------------------
In [136]:
# Values of argument 1 parameter list for classifier
param1 = 'lbfgs'
# Values of argument 2 parameter list for classifier
param2 = 'l2'

evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_2, label_2, outerFold_features_2, outerFold_labels_2, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 61.658
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 43.75
--------------------------------------------------------------------------------------------------
solver: lbfgs
penalty: l2
Features:  [81, 72, 67, 57, 63]
--------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0   0.439024  0.367347  0.400000        49
           1   0.436364  0.510638  0.470588        47

    accuracy                       0.437500        96
   macro avg   0.437694  0.438993  0.435294        96
weighted avg   0.437722  0.437500  0.434559        96

--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Logistic Regression: ROC AUC=0.414
--------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------
Confusion Matrix
--------------------------------------------------------------------------------------------------

Evaluating Outer Fold 3¶


In [137]:
# Values of argument 1 parameter list for classifier
params1 = [['lbfgs', 'liblinear'], ['lbfgs', 'liblinear'], ['lbfgs', 'liblinear'], ['lbfgs', 'liblinear'], ['lbfgs', 'liblinear']]
# Values of argument 2 parameter list for classifier
params2 = [['l2'], ['l2'], ['l2'], ['l2'], ['l2']]
# Empty list for FFS - Global Variable
featuresOuterFold = []
# Invoking FFS
outerFold(clf, clf_arguments1, clf_arguments2, params1, params2, features_3, label_3)
Iteration 1 >> Feature: 76; solver: liblinear; penalty: l2; Train Accuracy: 0.5959; Test Accuracy: 0.6012; Selected Features: [76]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 2 >> Feature: 52; solver: liblinear; penalty: l2; Train Accuracy: 0.5881; Test Accuracy: 0.596; Selected Features: [76, 52]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 3 >> Feature: 3; solver: liblinear; penalty: l2; Train Accuracy: 0.5765; Test Accuracy: 0.5958; Selected Features: [76, 52, 3]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 4 >> Feature: 72; solver: lbfgs; penalty: l2; Train Accuracy: 0.5739; Test Accuracy: 0.5707; Selected Features: [76, 52, 3, 72]
---------------------------------------------------------------------------------------------------------------------------------
Iteration 5 >> Feature: 18; solver: liblinear; penalty: l2; Train Accuracy: 0.5635; Test Accuracy: 0.5703; Selected Features: [76, 52, 3, 72, 18]
---------------------------------------------------------------------------------------------------------------------------------
In [138]:
# Values of argument 1 parameter list for classifier
param1 = 'lbfgs'
# Values of argument 2 parameter list for classifier
param2 = 'l2'

evaluationOuterFold(clf_name, clf, clf_arguments1, clf_arguments2, param1, param2, featuresOuterFold, features_3, label_3, outerFold_features_1, outerFold_labels_1, label_names)
--------------------------------------------------------------------------------------------------
Training Score on outer fold: 60.6218
--------------------------------------------------------------------------------------------------
Test Accuracy on outer fold: 51.0417
--------------------------------------------------------------------------------------------------
solver: lbfgs
penalty: l2
Features:  [76, 52, 3, 72, 18]
--------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0   0.555556  0.480769  0.515464        52
           1   0.470588  0.545455  0.505263        44

    accuracy                       0.510417        96
   macro avg   0.513072  0.513112  0.510364        96
weighted avg   0.516612  0.510417  0.510789        96

--------------------------------------------------------------------------------------------------
No Skill: ROC AUC=0.500
Logistic Regression: ROC AUC=0.535
--------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------
Confusion Matrix
--------------------------------------------------------------------------------------------------

Comparison among Outer Fold Results¶

In [139]:
combinedROCPlot(clf_name, featureSubset, param_1, param_2, trainScore, testScore, aucScore, falsePositiveRate, truePositiveRate)
--------------------------------------------
Outer Fold 1 Result
--------------------------------------------
Feature Subset:  [74, 73, 4, 88, 38]
Best n_estimator:  liblinear
Best max_depth:  l2
Train Score:  60.4167
Test Score:  46.391799999999996
AUC Score:  0.44245524296675187
--------------------------------------------
Outer Fold 2 Result
--------------------------------------------
Feature Subset:  [81, 72, 67, 57, 63]
Best n_estimator:  lbfgs
Best max_depth:  l2
Train Score:  61.658
Test Score:  43.75
AUC Score:  0.4138080764220582
--------------------------------------------
Outer Fold 3 Result
--------------------------------------------
Feature Subset:  [76, 52, 3, 72, 18]
Best n_estimator:  lbfgs
Best max_depth:  l2
Train Score:  60.6218
Test Score:  51.0417
AUC Score:  0.534965034965035
In [140]:
listB = [78, 68, 48, 59, 61]

res = list(map(selectedFeatures.__getitem__, listB))
res.append('BlcaGrade')
sns.pairplot(data[res], hue='BlcaGrade', palette='tab10')
Out[140]:
<seaborn.axisgrid.PairGrid at 0x7f714c8d6290>
In [ ]: